In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output.
Credit to Aleksey Bilogur's notebooks for inspiration.
Let's start by importing the data and taking a glimpse at the dataset.
In [2]:
data = pd.read_csv("../input/reddit_worldnews_start_to_2016-11-22.csv")
In [3]:
data.shape
Out[3]:
Columns in the dataset
In [4]:
data.columns
Out[4]:
The columns and their data types
In [5]:
data.dtypes
Out[5]:
A glimpse of the dataset rows and columns
In [6]:
data.head()
Out[6]:
Analysis starts here
The highest upvote counts in the dataset
In [8]:
data['up_votes'].sort_values(ascending=False).head()  # five highest upvote counts
Out[8]:
In [9]:
data['down_votes'].value_counts().head()  # distribution of downvote counts
Out[9]:
We can see from the above that no post has any downvotes.
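A one-line sanity check of that claim (a minimal sketch; assumes down_votes is numeric):
In [ ]:
(data['down_votes'] == 0).all()  # True means no post has any downvotes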
Let us check the most upvoted headlines.
In [10]:
data.sort_values('up_votes', ascending=False)['title'].head(10).tolist()
Out[10]:
Let us look at the mean upvotes per day, smoothed with a 120-day rolling mean.
In [11]:
import seaborn as sns
%matplotlib inline
sns.set_style("dark")

# mean upvotes per day, plus a 120-day rolling mean to smooth the noise
daily_mean = data.groupby('date_created')['up_votes'].mean()
daily_mean.plot(figsize=(12, 4))
daily_mean.rolling(window=120).mean().plot()
Out[11]:
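Note that date_created is grouped here as raw strings, so the x-axis is categorical rather than a true timeline. A minimal sketch of the same plot on a proper time axis (assuming the column holds parseable date strings):
In [ ]:
dates = pd.to_datetime(data['date_created'])  # assumption: strings parse as dates
data.groupby(dates)['up_votes'].mean().rolling(window=120).mean().plot(figsize=(12, 4))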
Let us check how many stories were deemed to be over 18
In [12]:
data.over_18.value_counts()
Out[12]:
320 stories are flagged over 18. Let's explore the top 10 of them.
In [14]:
nsfw_stories = data[data['over_18']]  # rows flagged over 18
nsfw_stories.sort_values('up_votes', ascending=False)['title'].head(10).tolist()
Out[14]:
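For scale, over-18 stories are a tiny fraction of the dataset; a quick sketch of the proportion:
In [ ]:
data.over_18.value_counts(normalize=True)  # share of True vs. False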
But who is posting to r/worldnews?
In [15]:
attf = data.author.value_counts()[:20]  # the twenty most frequent posters
attf.plot.bar(figsize=(12, 4))
Out[15]:
It looks like davidreiss666 is the top poster. Let us check the top 10 posts
In [16]:
top_poster = data[data['author'] == "davidreiss666"]
top_poster.sort_values('up_votes', ascending=False)['title'].head(10).tolist()
Out[16]:
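Post volume alone doesn't tell us whether these top posters actually get traction. A minimal sketch comparing the mean upvotes of the same top-20 posters:
In [ ]:
top_authors = data.author.value_counts()[:20].index
data[data.author.isin(top_authors)].groupby('author')['up_votes'].mean().sort_values(ascending=False)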
Let us find out how many times Obama, Hillary, and Trump were mentioned from 2008 to 2016.
In [35]:
candidates = ['Obama', 'Hillary', 'Trump']
for name in candidates:
    print(name, data.title.str.contains(name).value_counts())
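The loop above only gives overall totals. To break the mentions down by year, a sketch (assuming date_created parses with pd.to_datetime):
In [ ]:
years = pd.to_datetime(data['date_created']).dt.year  # assumption: parseable dates
for name in ['Obama', 'Hillary', 'Trump']:
    print(name)
    print(data[data.title.str.contains(name)].groupby(years).size())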
In [37]:
from nltk import word_tokenize

tokens = data.title.map(word_tokenize)  # tokenize every headline once up front

def tell_me_about(x):
    # match the query as a whole token, in either lowercase or Title case
    x_l = x.lower()
    x_t = x.title()
    return data.loc[tokens.map(lambda sent: x_l in sent or x_t in sent).values]
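Tokenizing every headline is fairly expensive. A lighter alternative sketch, here called tell_me_about_fast (a hypothetical name), does the same whole-word lookup with a case-insensitive regular expression:
In [ ]:
import re

def tell_me_about_fast(x):
    pattern = r'\b' + re.escape(x) + r'\b'  # whole-word match on the raw titles
    return data[data.title.str.contains(pattern, case=False, regex=True)]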
Let's glimpse some of the articles about Obama, Trump, and Hillary.
In [38]:
tell_me_about("Obama")['title'].values.tolist()[:10]
Out[38]:
In [39]:
tell_me_about("Hilary")['title'].values.tolist()[:10]
Out[39]:
In [40]:
tell_me_about("Donald")['title'].values.tolist()[:10]
Out[40]:
In [41]:
tell_me_about("Trump")['title'].values.tolist()[:10]
Out[41]:
And for fun, let's look at some headlines related to Apple Inc.
In [42]:
tell_me_about("Apple")['title'].values.tolist()[:10]
Out[42]:
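One caveat: tell_me_about lowercases the query, so "Apple" here also matches headlines about the fruit. A sketch that keeps only capitalized, company-style mentions:
In [ ]:
data[data.title.str.contains(r'\bApple\b', regex=True)]['title'].head(10).tolist()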